library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(dendextend)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(colorspace) ; library(corrplot)
library(GGally) ; library(ggpubr) ; library(ggExtra)
library(WGCNA)
library(expss)
library(polycor)
library(biomaRt)
library(clusterProfiler) ; library(ReactomePA) ; library(DOSE) ; library(org.Hs.eg.db)
library(foreach) ; library(doParallel)
library(knitr) ; library(kableExtra) ; library(xtable)
SFARI_colour_hue = function(r) {
pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}
# Get colors from the ggplot palette
gg_colour_hue = function(n) {
hues = seq(15, 375, length = n+1)
pal = hcl(h = hues, l = 65, c = 100)[1:n]
}
# Assign an HCL rainbow colour to each module
get_mod_colours = function(mods){
n = length(unique(mods))-1
set.seed(123) ; rand_order = sample(1:n)
mod_colors = c('white', gg_colour_hue(n)[rand_order])
names(mod_colors) = mods %>% table %>% names
return(mod_colors)
}
# Compare results from GSEA and ORA
compare_methods = function(GSEA_list, ORA_list, top_modules_enrichment, top_modules, database){
for(module in top_modules){
cat(paste0(' \n \n Enrichment results for cluster ',
genes_info$module_number[genes_info$Module==module][1], ': \n'))
cat(paste0('- GSEA has ', nrow(GSEA_list[[module]][[database]]@result), ' enriched term(s) \n'))
cat(paste0('- ORA has ', nrow(ORA_list[[module]][[database]]@result), ' enriched term(s) \n'))
cat(paste0('- ', nrow(top_modules_enrichment[[module]][[database]]),
' terms are enriched in both methods \n \n'))
enriched_terms = top_modules_enrichment[[module]][[database]] %>%
dplyr::select(ID, Description.x, p.adjust_ORA, p.adjust_GSEA, qvalue_ORA, GeneRatio) %>%
dplyr::rename('Description' = Description.x)
if(nrow(enriched_terms)>0){
print(enriched_terms %>% mutate(pval_mean = p.adjust_ORA + p.adjust_GSEA) %>%
arrange(pval_mean) %>% dplyr::select(-pval_mean) %>%
kable %>% kable_styling(full_width = F))
##########################################################################################################
# Get genes involved
genes = c()
i=1
for(row_genes in top_modules_enrichment[[module]][[database]] %>% pull(geneID)){
genes = c(genes, strsplit(row_genes,'/') %>% unlist) %>% unique
if(i==5){
cat(paste0('Genes involved in top 5 enriched terms: ',
paste(gene_names %>% filter(entrezgene %in% genes) %>% pull(hgnc_symbol) %>% unique %>%
sort, collapse = ', '),'\n'))
}
i = i+1
}
if(i != 5){
genes = gene_names %>% filter(entrezgene %in% genes) %>% pull(hgnc_symbol) %>% unique %>% sort
cat(paste0('\nGenes involved in all enriched terms: ', paste(genes, collapse = ', ')))
}
##########################################################################################################
}
}
}
plot_results = function(top_modules_enrichment, top_modules, database){
l = htmltools::tagList()
for(i in 1:length(top_modules)){
plot_data = top_modules_enrichment[[top_modules[i]]][[database]] %>%
dplyr::rename('Description' = Description.x)
if(nrow(plot_data)>5){
min_val = min(min(plot_data$p.adjust_GSEA), min(plot_data$p.adjust_ORA))
max_val = max(max(max(plot_data$p.adjust_GSEA), max(plot_data$p.adjust_ORA)), 0.05)
ggp = ggplotly(plot_data %>% ggplot(aes(p.adjust_GSEA, p.adjust_ORA, color = NES)) +
geom_point(aes(id = Description)) +
geom_vline(xintercept = 0.05, color = 'gray', linetype = 'dotted') +
geom_hline(yintercept = 0.05, color = 'gray', linetype = 'dotted') +
ggtitle(paste0('Enriched terms in common for cluster ',
genes_info$module_number[genes_info$Module==top_modules[i]][1])) +
scale_x_continuous(limits = c(min_val, max_val)) +
scale_y_continuous(limits = c(min_val, max_val)) +
xlab('Corrected p-value for GSEA') + ylab('Corrected p-value for ORA') +
scale_colour_viridis(direction = -1) + theme_minimal() + coord_fixed())
l[[i]] = ggp
}
}
return(l)
}
# plot_shared_genes(top_modules_enrichment, top_modules, 'GO')
plot_shared_genes = function(top_modules_enrichment, top_modules, database){
for(tm in 1:length(top_modules)){
plot_data = top_modules_enrichment[[top_modules[tm]]][[database]] %>%
mutate(pval_mean = p.adjust_ORA + p.adjust_GSEA) %>% arrange(pval_mean) %>%
dplyr::select(ID, geneID)
if(nrow(plot_data)>=2){
plot_data = plot_data %>% slice_head(n=5)
shared_genes = matrix(0, nrow(plot_data), nrow(plot_data))
for(i in 1:(nrow(plot_data)-1)){
for(j in (i+1):nrow(plot_data)){
gene_set_1 = strsplit(plot_data$geneID[i], '/') %>% unlist
gene_set_2 = strsplit(plot_data$geneID[j], '/') %>% unlist
shared_genes[i,j] = sum(gene_set_1 %in% gene_set_2)/length(unique(c(gene_set_1, gene_set_2)))
shared_genes[j,i] = shared_genes[i,j]
}
}
rownames(shared_genes) = plot_data$ID
colnames(shared_genes) = plot_data$ID
corrplot(shared_genes, type = 'lower', method = 'square', diag = FALSE, number.digits = 2, cl.pos = 'n',
tl.pos = 'ld', tl.col = '#666666', order = 'hclust', col.lim = c(0,1), addCoef.col = 'black',
mar = c(0,0,2,0), tl.cex = 0.8, number.cex= 0.8,
title = paste0('Genes in common for top terms in cluster ',
genes_info$module_number[genes_info$Module==top_modules[tm]][1]))
}
}
}
# Print table with top results (for annex in thesis)
print_table_w_top_results = function(top_modules_enrichment, module, database, n){
enriched_terms = top_modules_enrichment[[module]][[database]] %>%
mutate(pval_mean = p.adjust_ORA + p.adjust_GSEA) %>% arrange(pval_mean) %>%
top_n(-n, wt=pval_mean) %>% dplyr::rename('Description' = Description.x) %>%
dplyr::select(ID, Description, p.adjust_GSEA, p.adjust_ORA, NES, GeneRatio) %>%
xtable(display =c('s','s','s','e','e','f','s'))
return(print(enriched_terms, include.rownames=FALSE))
}
#print_table_w_top_results(selected_modules_enrichment, names(selected_modules_enrichment)[2], 'DN', 5)
# SFARI Genes
SFARI_genes = read_csv('./../../SFARI/Data/SFARI_genes_01-03-2020_w_ensembl_IDs.csv')
# Load Gandal dataset
load('./../Data/preprocessedData/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
# WGCNA metrics
WGCNA_metrics = read.csv('./../Data/preprocessedData/WGCNA_metrics.csv')
# Updates genes_info with SFARI information and clusters
genes_info = genes_info %>% left_join(SFARI_genes, by = 'ID') %>%
left_join(datGenes %>% mutate(ID = rownames(.)) %>% dplyr::select(ID, hgnc_symbol), by = 'ID') %>%
dplyr::select(ID, hgnc_symbol, log2FoldChange, shrunken_log2FoldChange, significant, Neuronal) %>%
left_join(WGCNA_metrics, by = 'ID') %>% dplyr::select(-contains('pval'))
################################################################################################################
# Get entrezene ID of genes
gene_names = genes_info %>% dplyr::rename('ensembl_gene_id' = ID) %>% filter(Module!='gray')
# ClusterProfile works with Entrez Gene Ids, o we have to assign one to each gene
getinfo = c('ensembl_gene_id','entrezgene')
mart=useMart(biomart='ENSEMBL_MART_ENSEMBL',dataset='hsapiens_gene_ensembl',host='feb2014.archive.ensembl.org')
biomart_output = getBM(attributes=getinfo, filters=c('ensembl_gene_id'),
values=gene_names$ensembl_gene_id, mart=mart)
gene_names = biomart_output %>% left_join(gene_names %>% dplyr::select(ensembl_gene_id, hgnc_symbol),
by='ensembl_gene_id') %>% dplyr::rename('ID'=ensembl_gene_id)
rm(getinfo, mart, biomart_output)
rm(dds, WGCNA_metrics)
Both GSEA and ORA are commonly used to study enrichment in sets of genes, but when using them for studying our modules both have shortcomings:
GSEA takes into consideration some ordering of the genes, in this case given by their Module Membership, which is correlated to the membership of genes to the module, but has two problems:
Being a continuous scale, it doesn’t separate by a threshold the genes that are truly in the cluster from the rest
The Module Membership metric is correlated to the real membership of the module, but this correlation is not perfect: a high MM doesn’t always mean the gene belongs to that module, for example, selecting a random module, in the plot below we can see the MM distribution of genes belonging to that module against the rest of the genes belonging to other modules and, although in general, genes belonging to that module have a higher distribution of MM, there is still a big overlap between the two groups, making MM a noisy metric for performing GSEA
module = genes_info %>% filter(abs(MTcor) > 0.7) %>% slice_head(n=1) %>% pull(Module) %>% as.character
plot_data = genes_info %>% dplyr::select(Module, paste0('MM.',gsub('#','',module))) %>%
mutate(in_module = substring(Module,2) == gsub('#','',module),
selected_module = paste('Cluster', genes_info$module_number[genes_info$Module==module][1] %>%
as.character)) %>%
mutate(alpha = ifelse(in_module, 0.8, 0.1))
colnames(plot_data)[2] = 'MM'
p = plot_data %>% ggplot(aes(selected_module, MM, color = in_module)) + geom_jitter(alpha = plot_data$alpha) +
xlab('') + ylab(paste('Cluster membership to cluster',
genes_info$module_number[genes_info$Module==module][1])) + coord_flip() +
theme_minimal() + theme(legend.position = 'bottom', axis.text.y = element_blank(),
axis.ticks.y = element_blank()) +
labs(color = paste('Gene belongs to cluster', genes_info$module_number[genes_info$Module==module][1]))
ggExtra::ggMarginal(p, type = 'density', groupColour = TRUE, groupFill = TRUE, margins = 'x', size=1)
rm(modules, module, p, plot_data)
So perhaps it could be useful to use both methods together, since they seem to complement each other’s shortcomings very well, performing the enrichment using both methods and identifying the terms that are found to be enriched by both
Note: Since the enrichment in both methods is quite a stric restriction, we decide to relax the corrected p-value threshold (using Bonferroni correction) to 0.1.
Note: This script may take a bit to run (~30 mins with an 8 core Intel(R) Core(TM) i5-8400H CPU @ 2.50GHz laptop) and sometimes there are problems with the API and it will freeze or kill the process printing ‘error writing to connection’, but this when this has happened, it has been fixed in less than a day (except once that took 4 days…).
#top_modules = c('#44A0FF', '#D177FF', '#F47B5B', '#00BADE', '#64B200', '#DD71FA')
# Modules: 6,38,63,22,34,46
top_modules = c('#00B2F3','#529EFF','#CA9700','#00BF75','#1BB700','#8893FF')
if(file.exists('./../Data/preprocessedData/top_modules_enrichment.RData')){
load('./../Data/preprocessedData/top_modules_enrichment.RData')
load('./../Data/preprocessedData/GSEA_results.RData')
load('./../Data/preprocessedData/ORA_results.RData')
} else{
################################################################################################################
# Prepare dataset for Enrichment Analysis
EA_dataset = genes_info %>% dplyr::rename('ensembl_gene_id' = ID) %>% filter(Module!='gray')
# ClusterProfile works with Entrez Gene Ids, o we have to assign one to each gene
getinfo = c('ensembl_gene_id','entrezgene')
mart=useMart(biomart='ENSEMBL_MART_ENSEMBL',dataset='hsapiens_gene_ensembl',host='feb2014.archive.ensembl.org')
biomart_output = getBM(attributes=getinfo, filters=c('ensembl_gene_id'),
values=EA_dataset$ensembl_gene_id, mart=mart)
EA_dataset = biomart_output %>% left_join(EA_dataset, by='ensembl_gene_id') %>% dplyr::rename('ID'=ensembl_gene_id)
rm(getinfo, mart, biomart_output)
################################################################################################################
# GSEA enrichment
file_name = './../Data/preprocessedData/GSEA_results.RData'
if(file.exists(file_name)){
load(file_name)
} else {
nPerm = 1e5
GSEA_dataset = EA_dataset %>% dplyr::select(ID, entrezgene, contains('MM.'))
GSEA_enrichment = list()
for(module in top_modules){
cat(paste0('\nModule: ', which(top_modules == module), '/', length(top_modules)))
geneList = GSEA_dataset %>% pull(paste0('MM.',substring(module,2)))
names(geneList) = GSEA_dataset %>% pull(entrezgene) %>% as.character
geneList = sort(geneList, decreasing = TRUE)
GSEA_GO = gseGO(geneList, OrgDb = org.Hs.eg.db, pAdjustMethod = 'bonferroni', pvalueCutoff = 0.1,
nPerm = nPerm, verbose = FALSE, seed = TRUE)
GSEA_DO = gseDO(geneList, pAdjustMethod = 'bonferroni', pvalueCutoff = 0.1,
nPerm = nPerm, verbose = FALSE, seed = TRUE)
GSEA_DGN = gseDGN(geneList, pAdjustMethod = 'bonferroni', pvalueCutoff = 0.1,
nPerm = nPerm, verbose = FALSE, seed = TRUE)
GSEA_KEGG = gseKEGG(geneList, organism = 'human', pAdjustMethod = 'bonferroni', pvalueCutoff = 0.1,
nPerm = nPerm, verbose = FALSE, seed = TRUE)
GSEA_Reactome = gsePathway(geneList, organism = 'human', pAdjustMethod = 'bonferroni', pvalueCutoff = 0.1,
nPerm = nPerm, verbose = FALSE, seed = TRUE)
GSEA_enrichment[[module]] = list('GO' = GSEA_GO, 'DO' = GSEA_DO, 'DGN' = GSEA_DGN, 'KEGG' = GSEA_KEGG,
'Reactome' = GSEA_Reactome)
# Save after each iteration (in case it breaks)
save(GSEA_enrichment, file = file_name)
}
rm(GSEA_dataset, nPerm, geneList, GSEA_GO, GSEA_DO, GSEA_DGN, GSEA_KEGG, GSEA_Reactome)
}
################################################################################################################
# ORA enrichment
file_name = './../Data/preprocessedData/ORA_results.RData'
if(file.exists(file_name)){
load(file_name)
} else {
# Prepare input
universe = EA_dataset$entrezgene %>% as.character
# Perform Enrichment
ORA_enrichment = list()
for(module in top_modules){
genes_in_module = EA_dataset %>% filter(Module == module) %>% pull(entrezgene)
ORA_GO = enrichGO(gene = genes_in_module, universe = universe, OrgDb = org.Hs.eg.db, ont = 'All',
pAdjustMethod = 'bonferroni', pvalueCutoff = 0.1, qvalueCutoff = 1)
ORA_DO = enrichDO(gene = genes_in_module, universe = universe, qvalueCutoff = 1,
pAdjustMethod = 'bonferroni', pvalueCutoff = 0.1)
ORA_DGN = enrichDGN(gene = genes_in_module, universe = universe, qvalueCutoff = 1,
pAdjustMethod = 'bonferroni', pvalueCutoff = 0.1)
ORA_KEGG = enrichKEGG(gene = genes_in_module, universe = universe, qvalueCutoff = 1,
pAdjustMethod = 'bonferroni', pvalueCutoff = 0.1)
ORA_Reactome = enrichPathway(gene = genes_in_module, universe = universe, qvalueCutoff = 1,
pAdjustMethod = 'bonferroni', pvalueCutoff = 0.1)
ORA_enrichment[[module]] = list('GO' = ORA_GO, 'DO' = ORA_DO, 'DGN' = ORA_DGN, 'KEGG' = ORA_KEGG,
'Reactome' = ORA_Reactome)
# Save after each iteration
save(ORA_enrichment, file = file_name)
}
rm(universe, genes_in_module, module, ORA_GO, ORA_DGN, ORA_DO, ORA_KEGG, ORA_Reactome)
}
################################################################################################################
# Get shared enrichment for each module
top_modules_enrichment = list()
for(module in top_modules){
module_enrichment = list()
GSEA_enrichment_for_module = GSEA_enrichment[[module]]
ORA_enrichment_for_module = ORA_enrichment[[module]]
for(dataset in c('KEGG', 'Reactome', 'GO', 'DO', 'DGN')){
GSEA_enrichment_dataset = GSEA_enrichment_for_module[[dataset]] %>% data.frame %>%
dplyr::rename('pvalue_GSEA' = pvalue, 'p.adjust_GSEA' = p.adjust, 'qvalues_GSEA' = qvalues)
ORA_enrichment_dataset = ORA_enrichment_for_module[[dataset]] %>% data.frame %>%
dplyr::rename('pvalue_ORA' = pvalue, 'p.adjust_ORA' = p.adjust, 'qvalue_ORA' = qvalue)
# Get shared enrichments (if any)
shared_enrichment_dataset = GSEA_enrichment_dataset %>% inner_join(ORA_enrichment_dataset, by = 'ID')
module_enrichment[[dataset]] = shared_enrichment_dataset
}
top_modules_enrichment[[module]] = module_enrichment
}
save(top_modules_enrichment, file = './../Data/preprocessedData/top_modules_enrichment.RData')
rm(module, module_enrichment, GSEA_enrichment_for_module, ORA_enrichment_for_module, dataset,
GSEA_enrichment_dataset, ORA_enrichment_dataset, shared_enrichment_dataset)
}
top_modules_mtcor = top_modules[1:3]
compare_methods(GSEA_enrichment, ORA_enrichment, top_modules_enrichment, top_modules_mtcor, 'GO')
Enrichment results for cluster 6:
- GSEA has 245 enriched term(s)
- ORA has 20 enriched term(s)
- 14 terms are enriched in both methods
| ID | Description | p.adjust_ORA | p.adjust_GSEA | qvalue_ORA | GeneRatio |
|---|---|---|---|---|---|
| GO:0060333 | interferon-gamma-mediated signaling pathway | 0.0000061 | 0.0886604 | 0.0000024 | 9/72 |
| GO:0034341 | response to interferon-gamma | 0.0001111 | 0.0888568 | 0.0000171 | 10/72 |
| GO:0071346 | cellular response to interferon-gamma | 0.0004974 | 0.0889945 | 0.0000658 | 9/72 |
| GO:0019221 | cytokine-mediated signaling pathway | 0.0000079 | 0.0899264 | 0.0000024 | 18/72 |
| GO:0045087 | innate immune response | 0.0000079 | 0.0899416 | 0.0000024 | 19/72 |
| GO:0034097 | response to cytokine | 0.0000490 | 0.0903737 | 0.0000113 | 21/72 |
| GO:0071345 | cellular response to cytokine stimulus | 0.0000882 | 0.0903584 | 0.0000163 | 20/72 |
| GO:0002252 | immune effector process | 0.0017538 | 0.0907519 | 0.0002030 | 19/72 |
| GO:0009607 | response to biotic stimulus | 0.0142368 | 0.0899074 | 0.0014646 | 14/72 |
| GO:0043207 | response to external biotic stimulus | 0.0413151 | 0.0900083 | 0.0034776 | 13/72 |
| GO:0051707 | response to other organism | 0.0413151 | 0.0900083 | 0.0034776 | 13/72 |
| GO:0098542 | defense response to other organism | 0.0557650 | 0.0892505 | 0.0043027 | 9/72 |
| GO:0050778 | positive regulation of immune response | 0.0624235 | 0.0899530 | 0.0044459 | 13/72 |
| GO:0002460 | adaptive immune response based on somatic recombination of immune receptors built from immunoglobulin superfamily domains | 0.0771116 | 0.0890169 | 0.0050998 | 7/72 |
Genes involved in top 5 enriched terms: ANXA1, BST2, C1R, C1S, CD44, FLNA, GBP1, GBP2, HLA-DQB1, HLA-DRB1, HLA-DRB5, IL18BP, IL6R, IRF7, MR1, NLRC5, PARP9, RAB27A, TNFRSF1A
Genes involved in all enriched terms: ADAM9, ALPK1, ANXA1, ANXA2, APOL1, BST2, C1R, C1S, CD44, CHI3L1, CRISPLD2, DDIT3, FLNA, GBP1, GBP2, HLA-DQB1, HLA-DRB1, HLA-DRB5, IFI35, IL18BP, IL6R, IRF7, MAOB, MR1, NEXN, NFIL3, NLRC5, PAK2, PARP9, PIK3R2, PYGL, RAB27A, SDC1, SERPINA3, SURF4, TNFRSF1A
Enrichment results for cluster 38:
- GSEA has 37 enriched term(s)
- ORA has 0 enriched term(s)
- 0 terms are enriched in both methods
Enrichment results for cluster 63:
- GSEA has 59 enriched term(s)
- ORA has 0 enriched term(s)
- 0 terms are enriched in both methods
Plots of the results when there are more than 5 terms in common between methods:
plot_results(top_modules_enrichment, top_modules_mtcor, 'GO')
plot_shared_genes(top_modules_enrichment, top_modules_mtcor, 'GO')
compare_methods(GSEA_enrichment, ORA_enrichment, top_modules_enrichment, top_modules_mtcor, 'DO')
Enrichment results for cluster 6:
- GSEA has 194 enriched term(s)
- ORA has 359 enriched term(s)
- 8 terms are enriched in both methods
| ID | Description | p.adjust_ORA | p.adjust_GSEA | qvalue_ORA | GeneRatio |
|---|---|---|---|---|---|
| DOID:77 | gastrointestinal system disease | 0.0100986 | 0.0117697 | 0.0047845 | 15/44 |
| DOID:1909 | melanoma | 0.0164602 | 0.0117487 | 0.0047845 | 14/44 |
| DOID:65 | connective tissue disease | 0.0205683 | 0.0117780 | 0.0047845 | 16/44 |
| DOID:409 | liver disease | 0.0308422 | 0.0116701 | 0.0052775 | 12/44 |
| DOID:3118 | hepatobiliary disease | 0.0378128 | 0.0116659 | 0.0052775 | 12/44 |
| DOID:0080001 | bone disease | 0.0557948 | 0.0117725 | 0.0055805 | 14/44 |
| DOID:557 | kidney disease | 0.0559777 | 0.0116492 | 0.0055805 | 10/44 |
| DOID:18 | urinary system disease | 0.0673986 | 0.0116475 | 0.0058792 | 10/44 |
Genes involved in top 5 enriched terms: ADAM9, ANXA2, APOL1, CD44, CHI3L1, COL4A1, CTNND1, DDIT3, FLNA, HLA-DQB1, HLA-DRB1, HLA-DRB5, IL18BP, IL6R, IRF7, NAMPT, PAK2, RAB27A, S100A10, SDC1, SERPINA3, TNFRSF1A
Genes involved in all enriched terms: ADAM9, ANXA1, ANXA2, APOL1, CD44, CHI3L1, COL4A1, CTNND1, DDIT3, FLNA, HLA-DQB1, HLA-DRB1, HLA-DRB5, IL18BP, IL6R, IRF7, NAMPT, PAK2, PDLIM4, POR, RAB27A, S100A10, SDC1, SERPINA3, SLC26A2, TNFRSF1A
Enrichment results for cluster 38:
- GSEA has 33 enriched term(s)
- ORA has 352 enriched term(s)
- 0 terms are enriched in both methods
Enrichment results for cluster 63:
- GSEA has 93 enriched term(s)
- ORA has 436 enriched term(s)
- 0 terms are enriched in both methods
Plots of the results when there are more than 5 terms in common between methods:
plot_results(top_modules_enrichment, top_modules_mtcor, 'DO')
plot_shared_genes(top_modules_enrichment, top_modules_mtcor, 'DO')
compare_methods(GSEA_enrichment, ORA_enrichment, top_modules_enrichment, top_modules_mtcor, 'DGN')
Enrichment results for cluster 6:
- GSEA has 326 enriched term(s)
- ORA has 1260 enriched term(s)
- 19 terms are enriched in both methods
| ID | Description | p.adjust_ORA | p.adjust_GSEA | qvalue_ORA | GeneRatio |
|---|---|---|---|---|---|
| umls:C0036202 | Sarcoidosis | 0.0006259 | 0.0453306 | 0.0002160 | 9/67 |
| umls:C0023895 | Liver diseases | 0.0002951 | 0.0457957 | 0.0002036 | 13/67 |
| umls:C0024115 | Lung diseases | 0.0033629 | 0.0454935 | 0.0007735 | 10/67 |
| umls:C0677607 | Hashimoto Disease | 0.0064239 | 0.0451914 | 0.0008705 | 6/67 |
| umls:C0021368 | Inflammation | 0.0075685 | 0.0458942 | 0.0008705 | 12/67 |
| umls:C0021390 | Inflammatory Bowel Diseases | 0.0106762 | 0.0459959 | 0.0010090 | 14/67 |
| umls:C0409651 | Seropositive rheumatoid arthritis | 0.0131302 | 0.0451427 | 0.0010090 | 4/67 |
| umls:C1800706 | Idiopathic Pulmonary Fibrosis | 0.0141286 | 0.0454432 | 0.0010090 | 9/67 |
| umls:C0162871 | Aortic Aneurysm, Abdominal | 0.0146213 | 0.0452816 | 0.0010090 | 7/67 |
| umls:C0042373 | Vascular Diseases | 0.0170781 | 0.0455706 | 0.0010182 | 10/67 |
| umls:C0009324 | Ulcerative Colitis | 0.0177055 | 0.0459426 | 0.0010182 | 13/67 |
| umls:C0042164 | Uveitis | 0.0215871 | 0.0453599 | 0.0011459 | 6/67 |
| umls:C0004153 | Atherosclerosis | 0.0240654 | 0.0463680 | 0.0011862 | 17/67 |
| umls:C0019196 | Hepatitis C | 0.0350402 | 0.0458690 | 0.0015992 | 12/67 |
| umls:C0034069 | Pulmonary Fibrosis | 0.0370794 | 0.0455296 | 0.0015992 | 9/67 |
| umls:C0242583 | Bare Lymphocyte Syndrome | 0.0422210 | 0.0451923 | 0.0016186 | 4/67 |
| umls:C2931418 | Bare lymphocyte syndrome 2 | 0.0422210 | 0.0451923 | 0.0016186 | 4/67 |
| umls:C0025007 | Measles | 0.0521393 | 0.0452571 | 0.0018569 | 6/67 |
| umls:C0031099 | Periodontitis | 0.0837657 | 0.0453618 | 0.0026274 | 7/67 |
Genes involved in top 5 enriched terms: ANXA2, BAG3, C1S, HLA-DQB1, HLA-DRB1, HLA-DRB5, IL6R, S100A10, SDC1, SLC3A2, TNFRSF1A
Genes involved in all enriched terms: ADAM9, ALPK1, ANGPTL4, ANXA1, ANXA2, APOL1, BAG3, C1S, CD44, CHI3L1, COL4A1, CRISPLD2, DDIT3, FLNA, GBP1, HAP1, HLA-DQB1, HLA-DRB1, HLA-DRB5, IL18BP, IL6R, IRF7, MAOB, MR1, NAMPT, NEXN, NFIL3, PAK2, PARP9, S100A10, SDC1, SERPINA3, SLC3A2, SP110, TNFRSF1A, TRIM69
Enrichment results for cluster 38:
- GSEA has 28 enriched term(s)
- ORA has 1299 enriched term(s)
- 0 terms are enriched in both methods
Enrichment results for cluster 63:
- GSEA has 69 enriched term(s)
- ORA has 1610 enriched term(s)
- 0 terms are enriched in both methods
Plots of the results when there are more than 5 terms in common between methods:
plot_results(top_modules_enrichment, top_modules_mtcor, 'DGN')
plot_shared_genes(top_modules_enrichment, top_modules_mtcor, 'DGN')
compare_methods(GSEA_enrichment, ORA_enrichment, top_modules_enrichment, top_modules_mtcor, 'KEGG')
Enrichment results for cluster 6:
- GSEA has 42 enriched term(s)
- ORA has 175 enriched term(s)
- 8 terms are enriched in both methods
| ID | Description | p.adjust_ORA | p.adjust_GSEA | qvalue_ORA | GeneRatio |
|---|---|---|---|---|---|
| hsa04640 | Hematopoietic cell lineage | 0.0018028 | 0.0062067 | 0.0008968 | 5/36 |
| hsa05150 | Staphylococcus aureus infection | 0.0021765 | 0.0061988 | 0.0008968 | 5/36 |
| hsa05164 | Influenza A | 0.0205482 | 0.0062136 | 0.0056443 | 6/36 |
| hsa05322 | Systemic lupus erythematosus | 0.0289963 | 0.0062168 | 0.0059737 | 5/36 |
| hsa05320 | Autoimmune thyroid disease | 0.0823203 | 0.0062032 | 0.0085058 | 3/36 |
| hsa05330 | Allograft rejection | 0.0823203 | 0.0062032 | 0.0085058 | 3/36 |
| hsa05332 | Graft-versus-host disease | 0.0823203 | 0.0062032 | 0.0085058 | 3/36 |
| hsa05169 | Epstein-Barr virus infection | 0.0825746 | 0.0062284 | 0.0085058 | 6/36 |
Genes involved in top 5 enriched terms: C1R, C1S, CD44, HLA-DQB1, HLA-DRB1, HLA-DRB5, IL6R
Genes involved in all enriched terms: C1R, C1S, CD44, HLA-DQB1, HLA-DRB1, HLA-DRB5, IL6R, IRF7, PIK3R2, TNFRSF1A
Enrichment results for cluster 38:
- GSEA has 3 enriched term(s)
- ORA has 155 enriched term(s)
- 0 terms are enriched in both methods
Enrichment results for cluster 63:
- GSEA has 6 enriched term(s)
- ORA has 209 enriched term(s)
- 0 terms are enriched in both methods
Plots of the results when there are more than 5 terms in common between methods:
plot_results(top_modules_enrichment, top_modules_mtcor, 'KEGG')
plot_shared_genes(top_modules_enrichment, top_modules_mtcor, 'KEGG')
compare_methods(GSEA_enrichment, ORA_enrichment, top_modules_enrichment, top_modules_mtcor, 'Reactome')
Enrichment results for cluster 6:
- GSEA has 20 enriched term(s)
- ORA has 232 enriched term(s)
- 7 terms are enriched in both methods
| ID | Description | p.adjust_ORA | p.adjust_GSEA | qvalue_ORA | GeneRatio |
|---|---|---|---|---|---|
| R-HSA-913531 | Interferon Signaling | 0.0000928 | 0.0201370 | 0.0000402 | 10/58 |
| R-HSA-1280215 | Cytokine Signaling in Immune system | 0.0000064 | 0.0205247 | 0.0000056 | 18/58 |
| R-HSA-877300 | Interferon gamma signaling | 0.0011405 | 0.0200244 | 0.0003294 | 7/58 |
| R-HSA-202433 | Generation of second messenger molecules | 0.0114849 | 0.0199493 | 0.0024882 | 4/58 |
| R-HSA-168249 | Innate Immune System | 0.0627832 | 0.0206425 | 0.0068010 | 16/58 |
| R-HSA-202427 | Phosphorylation of CD3 and TCR zeta chains | 0.0619852 | 0.0597430 | 0.0068010 | 3/58 |
| R-HSA-389948 | PD-1 signaling | 0.0619852 | 0.0995716 | 0.0068010 | 3/58 |
Genes involved in top 5 enriched terms: ANXA1, ANXA2, ATP6V1C2, BST2, C1R, C1S, CD44, CHI3L1, CRISPLD2, FLNA, GBP1, GBP2, HLA-DQB1, HLA-DRB1, HLA-DRB5, IFI35, IL18BP, IL6R, IRF7, NLRC5, PAK2, PIK3R2, PYGL, RAB27A, SDC1, SERPINA3, SURF4, TNFRSF1A
Genes involved in all enriched terms: ANXA1, ANXA2, ATP6V1C2, BST2, C1R, C1S, CD44, CHI3L1, CRISPLD2, FLNA, GBP1, GBP2, HLA-DQB1, HLA-DRB1, HLA-DRB5, IFI35, IL18BP, IL6R, IRF7, NLRC5, PAK2, PIK3R2, PYGL, RAB27A, SDC1, SERPINA3, SURF4, TNFRSF1A
Enrichment results for cluster 38:
- GSEA has 15 enriched term(s)
- ORA has 405 enriched term(s)
- 0 terms are enriched in both methods
Enrichment results for cluster 63:
- GSEA has 22 enriched term(s)
- ORA has 591 enriched term(s)
- 0 terms are enriched in both methods
Plots of the results when there are more than 5 terms in common between methods:
plot_results(top_modules_enrichment, top_modules_mtcor, 'Reactome')
plot_shared_genes(top_modules_enrichment, top_modules_mtcor, 'Reactome')
top_modules_SFARI = top_modules[4:6]
compare_methods(GSEA_enrichment, ORA_enrichment, top_modules_enrichment, top_modules_SFARI, 'GO')
Enrichment results for cluster 22:
- GSEA has 53 enriched term(s)
- ORA has 12 enriched term(s)
- 2 terms are enriched in both methods
| ID | Description | p.adjust_ORA | p.adjust_GSEA | qvalue_ORA | GeneRatio |
|---|---|---|---|---|---|
| GO:0045944 | positive regulation of transcription by RNA polymerase II | 0.0002441 | 0.0484942 | 0.0002145 | 12/30 |
| GO:0045444 | fat cell differentiation | 0.0444174 | 0.0607138 | 0.0195132 | 5/30 |
Genes involved in all enriched terms: CSRNP1, EGR1, EGR2, EGR4, FOSL2, HES1, IER2, KLF10, NPAS4, NR4A1, NR4A3, PER2, RPS6KA3
Enrichment results for cluster 34:
- GSEA has 85 enriched term(s)
- ORA has 2 enriched term(s)
- 1 terms are enriched in both methods
| ID | Description | p.adjust_ORA | p.adjust_GSEA | qvalue_ORA | GeneRatio |
|---|---|---|---|---|---|
| GO:0007215 | glutamate receptor signaling pathway | 0.0033498 | 0.0645637 | 0.0032175 | 7/75 |
Genes involved in all enriched terms: CX3CL1, GRIA1, GRIK3, GRM2, MINK1, NLGN3, SSTR1
Enrichment results for cluster 46:
- GSEA has 13 enriched term(s)
- ORA has 0 enriched term(s)
- 0 terms are enriched in both methods
Plots of the results when there are more than 5 terms in common between methods:
plot_results(top_modules_enrichment, top_modules_SFARI, 'GO')
plot_shared_genes(top_modules_enrichment, top_modules_SFARI, 'GO')
compare_methods(GSEA_enrichment, ORA_enrichment, top_modules_enrichment, top_modules_SFARI, 'DO')
Enrichment results for cluster 22:
- GSEA has 59 enriched term(s)
- ORA has 190 enriched term(s)
- 0 terms are enriched in both methods
Enrichment results for cluster 34:
- GSEA has 2 enriched term(s)
- ORA has 276 enriched term(s)
- 0 terms are enriched in both methods
Enrichment results for cluster 46:
- GSEA has 6 enriched term(s)
- ORA has 112 enriched term(s)
- 0 terms are enriched in both methods
compare_methods(GSEA_enrichment, ORA_enrichment, top_modules_enrichment, top_modules_SFARI, 'DGN')
Enrichment results for cluster 22:
- GSEA has 67 enriched term(s)
- ORA has 704 enriched term(s)
- 3 terms are enriched in both methods
| ID | Description | p.adjust_ORA | p.adjust_GSEA | qvalue_ORA | GeneRatio |
|---|---|---|---|---|---|
| umls:C0476089 | Endometrial Carcinoma | 0.0351175 | 0.0264086 | 0.0163471 | 8/30 |
| umls:C0740392 | Infarction, Middle Cerebral Artery | 0.0387641 | 0.0336964 | 0.0163471 | 4/30 |
| umls:C0151744 | Myocardial Ischemia | 0.0722839 | 0.0272557 | 0.0163471 | 7/30 |
Genes involved in all enriched terms: ARC, BRAF, CTSB, DUSP6, EGR1, EGR2, EGR4, FOSL2, HES1, NR4A1, PER2, SPRY2, TRIB1
Enrichment results for cluster 34:
- GSEA has 0 enriched term(s)
- ORA has 779 enriched term(s)
- 0 terms are enriched in both methods
Enrichment results for cluster 46:
- GSEA has 9 enriched term(s)
- ORA has 424 enriched term(s)
- 0 terms are enriched in both methods
Plots of the results when there are more than 5 terms in common between methods:
plot_results(top_modules_enrichment, top_modules_SFARI, 'DGN')
plot_shared_genes(top_modules_enrichment, top_modules_SFARI, 'DGN')
compare_methods(GSEA_enrichment, ORA_enrichment, top_modules_enrichment, top_modules_SFARI, 'KEGG')
Enrichment results for cluster 22:
- GSEA has 7 enriched term(s)
- ORA has 77 enriched term(s)
- 0 terms are enriched in both methods
Enrichment results for cluster 34:
- GSEA has 19 enriched term(s)
- ORA has 178 enriched term(s)
- 1 terms are enriched in both methods
| ID | Description | p.adjust_ORA | p.adjust_GSEA | qvalue_ORA | GeneRatio |
|---|---|---|---|---|---|
| hsa04080 | Neuroactive ligand-receptor interaction | 0.0009935 | 0.0041954 | 0.0009342 | 8/35 |
Genes involved in all enriched terms: CNR1, GABRA5, GABRB3, GRIA1, GRIK3, GRM2, LYPD6B, SSTR1
Enrichment results for cluster 46:
- GSEA has 24 enriched term(s)
- ORA has 94 enriched term(s)
- 0 terms are enriched in both methods
Plots of the results when there are more than 5 terms in common between methods:
plot_results(top_modules_enrichment, top_modules_SFARI, 'KEGG')
plot_shared_genes(top_modules_enrichment, top_modules_SFARI, 'KEGG')
compare_methods(GSEA_enrichment, ORA_enrichment, top_modules_enrichment, top_modules_SFARI, 'Reactome')
Enrichment results for cluster 22:
- GSEA has 2 enriched term(s)
- ORA has 126 enriched term(s)
- 0 terms are enriched in both methods
Enrichment results for cluster 34:
- GSEA has 49 enriched term(s)
- ORA has 287 enriched term(s)
- 0 terms are enriched in both methods
Enrichment results for cluster 46:
- GSEA has 30 enriched term(s)
- ORA has 131 enriched term(s)
- 0 terms are enriched in both methods
# Get cluster name for clusters with numbers 38, 63, and 46
selected_modules = c(genes_info %>% filter(module_number==38) %>% slice_head(1) %>% pull(Module) %>% as.character,
genes_info %>% filter(module_number==63) %>% slice_head(1) %>% pull(Module) %>% as.character,
genes_info %>% filter(module_number==46) %>% slice_head(1) %>% pull(Module) %>% as.character)
if(file.exists('./../Data/preprocessedData/top_modules_enrichment_relaxed.RData')){
load('./../Data/preprocessedData/top_modules_enrichment_relaxed.RData')
load('./../Data/preprocessedData/GSEA_results_relaxed.RData')
load('./../Data/preprocessedData/ORA_results_relaxed.RData')
} else{
pvalueCutoff = 0.5
################################################################################################################
# Prepare dataset for Enrichment Analysis
EA_dataset = genes_info %>% dplyr::rename('ensembl_gene_id' = ID) %>% filter(Module!='gray')
# ClusterProfile works with Entrez Gene Ids, o we have to assign one to each gene
getinfo = c('ensembl_gene_id','entrezgene')
mart=useMart(biomart='ENSEMBL_MART_ENSEMBL',dataset='hsapiens_gene_ensembl',host='feb2014.archive.ensembl.org')
biomart_output = getBM(attributes=getinfo, filters=c('ensembl_gene_id'),
values=EA_dataset$ensembl_gene_id, mart=mart)
EA_dataset = biomart_output %>% left_join(EA_dataset, by='ensembl_gene_id') %>%
dplyr::rename('ID'=ensembl_gene_id) %>% distinct(entrezgene, .keep_all = TRUE)
rm(getinfo, mart, biomart_output)
################################################################################################################
# GSEA enrichment
file_name = './../Data/preprocessedData/GSEA_results_relaxed.RData'
if(file.exists(file_name)){
load(file_name)
} else {
cat('\n\nPerforming GSEA\n')
nPerm = 1e5
GSEA_dataset = EA_dataset %>% dplyr::select(ID, entrezgene, contains('MM.'))
GSEA_enrichment = list()
for(module in selected_modules){
cat(paste0('\nModule: ', which(selected_modules == module), '/', length(selected_modules)))
geneList = GSEA_dataset %>% pull(paste0('MM.',substring(module,2)))
names(geneList) = GSEA_dataset %>% pull(entrezgene) %>% as.character
geneList = sort(geneList, decreasing = TRUE)
GSEA_GO = gseGO(geneList, OrgDb = org.Hs.eg.db, pAdjustMethod = 'bonferroni', pvalueCutoff = pvalueCutoff,
nPerm = nPerm, verbose = FALSE, seed = TRUE)
GSEA_DO = gseDO(geneList, pAdjustMethod = 'bonferroni', pvalueCutoff = pvalueCutoff,
nPerm = nPerm, verbose = FALSE, seed = TRUE)
GSEA_DGN = gseDGN(geneList, pAdjustMethod = 'bonferroni', pvalueCutoff = pvalueCutoff,
nPerm = nPerm, verbose = FALSE, seed = TRUE)
GSEA_KEGG = gseKEGG(geneList, organism = 'human', pAdjustMethod = 'bonferroni', pvalueCutoff = pvalueCutoff,
nPerm = nPerm, verbose = FALSE, seed = TRUE)
GSEA_Reactome = gsePathway(geneList, organism = 'human', pAdjustMethod = 'bonferroni', pvalueCutoff = pvalueCutoff,
nPerm = nPerm, verbose = FALSE, seed = TRUE)
GSEA_enrichment[[module]] = list('GO' = GSEA_GO, 'DO' = GSEA_DO, 'DGN' = GSEA_DGN, 'KEGG' = GSEA_KEGG,
'Reactome' = GSEA_Reactome)
# Save after each iteration (in case it breaks)
save(GSEA_enrichment, file = file_name)
}
rm(GSEA_dataset, nPerm, geneList, GSEA_GO, GSEA_DO, GSEA_DGN, GSEA_KEGG, GSEA_Reactome)
}
################################################################################################################
# ORA enrichment
file_name = './../Data/preprocessedData/ORA_results_relaxed.RData'
if(file.exists(file_name)){
load(file_name)
} else {
cat('\n\nPerforming ORA\n')
# Prepare input
universe = EA_dataset$entrezgene %>% as.character
# Perform Enrichment
ORA_enrichment = list()
for(module in selected_modules){
cat(paste0('\nModule: ', which(selected_modules == module), '/', length(selected_modules)))
genes_in_module = EA_dataset %>% filter(Module == module) %>% pull(entrezgene)
ORA_GO = enrichGO(gene = genes_in_module, universe = universe, OrgDb = org.Hs.eg.db, ont = 'All',
pAdjustMethod = 'bonferroni', pvalueCutoff = pvalueCutoff, qvalueCutoff = 1)
ORA_DO = enrichDO(gene = genes_in_module, universe = universe, qvalueCutoff = 1,
pAdjustMethod = 'bonferroni', pvalueCutoff = pvalueCutoff)
ORA_DGN = enrichDGN(gene = genes_in_module, universe = universe, qvalueCutoff = 1,
pAdjustMethod = 'bonferroni', pvalueCutoff = pvalueCutoff)
ORA_KEGG = enrichKEGG(gene = genes_in_module, universe = universe, qvalueCutoff = 1,
pAdjustMethod = 'bonferroni', pvalueCutoff = pvalueCutoff)
ORA_Reactome = enrichPathway(gene = genes_in_module, universe = universe, qvalueCutoff = 1,
pAdjustMethod = 'bonferroni', pvalueCutoff = pvalueCutoff)
ORA_enrichment[[module]] = list('GO' = ORA_GO, 'DO' = ORA_DO, 'DGN' = ORA_DGN, 'KEGG' = ORA_KEGG,
'Reactome' = ORA_Reactome)
# Save after each iteration (in case it breaks)
save(ORA_enrichment, file = file_name)
}
rm(universe, genes_in_module, module, ORA_GO, ORA_DGN, ORA_DO, ORA_KEGG, ORA_Reactome)
}
################################################################################################################
# Get shared enrichment for each module
selected_modules_enrichment = list()
for(module in selected_modules){
module_enrichment = list()
GSEA_enrichment_for_module = GSEA_enrichment[[module]]
ORA_enrichment_for_module = ORA_enrichment[[module]]
for(dataset in c('KEGG', 'Reactome', 'GO', 'DO', 'DGN')){
GSEA_enrichment_dataset = GSEA_enrichment_for_module[[dataset]] %>% data.frame %>%
dplyr::rename('pvalue_GSEA' = pvalue, 'p.adjust_GSEA' = p.adjust, 'qvalues_GSEA' = qvalues)
ORA_enrichment_dataset = ORA_enrichment_for_module[[dataset]] %>% data.frame %>%
dplyr::rename('pvalue_ORA' = pvalue, 'p.adjust_ORA' = p.adjust, 'qvalue_ORA' = qvalue)
# Get shared enrichments (if any)
shared_enrichment_dataset = GSEA_enrichment_dataset %>% inner_join(ORA_enrichment_dataset, by = 'ID')
module_enrichment[[dataset]] = shared_enrichment_dataset
}
selected_modules_enrichment[[module]] = module_enrichment
}
save(selected_modules_enrichment, file = './../Data/preprocessedData/top_modules_enrichment_relaxed.RData')
rm(module, module_enrichment, GSEA_enrichment_for_module, ORA_enrichment_for_module, dataset,
GSEA_enrichment_dataset, ORA_enrichment_dataset, shared_enrichment_dataset)
}
Relaxing the p-value only worked for one of the modules, the other still has zero elements in common between the GSEA and ORA results
compare_methods(GSEA_enrichment, ORA_enrichment, selected_modules_enrichment, selected_modules[1], 'KEGG')
Enrichment results for cluster 38:
- GSEA has 7 enriched term(s)
- ORA has 155 enriched term(s)
- 1 terms are enriched in both methods
| ID | Description | p.adjust_ORA | p.adjust_GSEA | qvalue_ORA | GeneRatio |
|---|---|---|---|---|---|
| hsa05168 | Herpes simplex virus 1 infection | 0.49945 | 0.0037757 | 0.49945 | 10/72 |
Genes involved in all enriched terms: JAK1, TP53, ZNF12, ZNF222, ZNF275, ZNF430, ZNF432, ZNF529, ZNF549, ZNF563
compare_methods(GSEA_enrichment, ORA_enrichment, selected_modules_enrichment, selected_modules[2], 'DO')
Enrichment results for cluster 63:
- GSEA has 141 enriched term(s)
- ORA has 436 enriched term(s)
- 2 terms are enriched in both methods
| ID | Description | p.adjust_ORA | p.adjust_GSEA | qvalue_ORA | GeneRatio |
|---|---|---|---|---|---|
| DOID:0050615 | respiratory system cancer | 0.1860411 | 0.0193661 | 0.0592858 | 17/88 |
| DOID:4074 | pancreas adenocarcinoma | 0.2226042 | 0.3080494 | 0.0592858 | 8/88 |
Genes involved in all enriched terms: BCL2, CD59, CD63, CEBPG, EML4, ENO1, ERBB2, F3, FGF2, INHBA, ITGAV, LGALS3, LIMS1, MDM2, MGST1, MTHFR, NQO1, SH3GLB1, TUBB2B
compare_methods(GSEA_enrichment, ORA_enrichment, selected_modules_enrichment, selected_modules[2], 'DGN')
Enrichment results for cluster 63:
- GSEA has 137 enriched term(s)
- ORA has 1610 enriched term(s)
- 1 terms are enriched in both methods
| ID | Description | p.adjust_ORA | p.adjust_GSEA | qvalue_ORA | GeneRatio |
|---|---|---|---|---|---|
| umls:C0008925 | Cleft Palate | 0.2953576 | 0.0798431 | 0.1200704 | 11/174 |
Genes involved in all enriched terms: ALDH1L1, BRWD3, CBFB, CHUK, ERBB2, EVC, FGF2, MID1, MTHFR, SDC2, TGFB2
Note: I am using a corrected p-value threhsold of 0.05, since the relaxation was only because we were combining these results with the ones from the ORA
load('./../Data/preprocessedData/GSEA_results.RData')
load('./../Data/preprocessedData/ORA_results.RData')
print_GSEA_top_results = function(module, n){
for(database in c('GO','DO','DGN','KEGG','Reactome')){
res = GSEA_enrichment[[module]][[database]]@result %>% filter(p.adjust<0.05 & NES>0) %>%
dplyr::select(ID, Description, NES, p.adjust, qvalues) %>% arrange(desc(NES)) %>% top_n(n, wt=NES)
cat(paste0('\n',database,':\n'))
if(nrow(res)>0){
print(res %>% kable %>% kable_styling(full_width = F))
#print(xtable(res, display =c('s','s','s','f','e','e')), include.rownames=FALSE) # thesis
} else {
cat('\nNo enriched terms found\n\n\n')
}
}
}
plot_shared_genes_GSEA = function(module, n){
for(database in c('GO','DO','DGN','KEGG','Reactome')){
plot_data = GSEA_enrichment[[module]][[database]]@result %>% filter(p.adjust<0.05 & NES>0) %>%
arrange(desc(NES)) %>% dplyr::select(ID, core_enrichment) %>% slice_head(n=n)
if(nrow(plot_data)>1){
shared_genes = matrix(0, nrow(plot_data), nrow(plot_data))
for(i in 1:(nrow(plot_data)-1)){
for(j in (i+1):nrow(plot_data)){
gene_set_1 = strsplit(plot_data$core_enrichment[i], '/') %>% unlist %>% unique
gene_set_2 = strsplit(plot_data$core_enrichment[j], '/') %>% unlist %>% unique
shared_genes[i,j] = sum(gene_set_1 %in% gene_set_2)/length(unique(c(gene_set_1, gene_set_2)))
shared_genes[j,i] = shared_genes[i,j]
}
}
rownames(shared_genes) = plot_data$ID
colnames(shared_genes) = plot_data$ID
corrplot(shared_genes, type = 'lower', method = 'square', diag = FALSE, number.digits = 2, cl.pos = 'n',
tl.pos = 'ld', tl.col = '#666666', order = 'hclust', col.lim = c(0,1), addCoef.col = 'black',
mar = c(0,0,2,0), tl.cex = 0.8, number.cex= 0.8,
title = paste0('Genes in common in the ',database, ' database for cluster ',
genes_info$module_number[genes_info$Module==module][1]))
}
}
}
print_ORA_top_results = function(module, n){
for(database in c('GO','DO','DGN','KEGG','Reactome')){
res = ORA_enrichment[[module]][[database]]@result %>% filter(p.adjust<0.05) %>%
dplyr::select(ID, Description, p.adjust, qvalue, GeneRatio) %>% arrange(p.adjust) %>%
top_n(n, wt=p.adjust)
cat(paste0('\n',database,':\n'))
if(nrow(res)>0){
print(res %>% kable %>% kable_styling(full_width = F))
#print(xtable(res, display =c('s','s','s','e','e','s')), include.rownames=FALSE) # thesis
} else {
cat('\nNo enriched terms found\n\n\n')
}
}
}
plot_shared_genes_ORA = function(module, n){
for(database in c('GO','DO','DGN','KEGG','Reactome')){
plot_data = ORA_enrichment[[module]][[database]]@result %>% filter(p.adjust<0.05) %>%
arrange(desc(p.adjust)) %>% dplyr::select(ID, geneID) %>% slice_head(n=n)
if(nrow(plot_data)>1){
shared_genes = matrix(0, nrow(plot_data), nrow(plot_data))
for(i in 1:(nrow(plot_data)-1)){
for(j in (i+1):nrow(plot_data)){
gene_set_1 = strsplit(plot_data$core_enrichment[i], '/') %>% unlist %>% unique
gene_set_2 = strsplit(plot_data$core_enrichment[j], '/') %>% unlist %>% unique
shared_genes[i,j] = sum(gene_set_1 %in% gene_set_2)/length(unique(c(gene_set_1, gene_set_2)))
shared_genes[j,i] = shared_genes[i,j]
}
}
rownames(shared_genes) = plot_data$ID
colnames(shared_genes) = plot_data$ID
corrplot(shared_genes, type = 'lower', method = 'square', diag = FALSE, number.digits = 2, cl.pos = 'n',
tl.pos = 'ld', tl.col = '#666666', order = 'hclust', col.lim = c(0,1), addCoef.col = 'black',
mar = c(0,0,2,0), tl.cex = 0.8, number.cex= 0.8,
title = paste0('Genes in common in the ',database, ' database for cluster ',
genes_info$module_number[genes_info$Module==module][1]))
}
}
}
print_GSEA_top_results(selected_modules[3], 5)
GO:
| ID | Description | NES | p.adjust | qvalues | |
|---|---|---|---|---|---|
| GO:0045892 | GO:0045892 | negative regulation of transcription, DNA-templated | 1.655441 | 0.0490487 | 0.0026761 |
| ID | Description | NES | p.adjust | qvalues | |
|---|---|---|---|---|---|
| DOID:0060040 | DOID:0060040 | pervasive developmental disorder | 2.064808 | 0.0160467 | 0.0040484 |
| DOID:0060041 | DOID:0060041 | autism spectrum disorder | 2.041408 | 0.0161937 | 0.0040484 |
| DOID:12849 | DOID:12849 | autistic disorder | 2.041408 | 0.0161937 | 0.0040484 |
| DOID:0060037 | DOID:0060037 | developmental disorder of mental health | 1.968701 | 0.0073923 | 0.0040484 |
| ID | Description | NES | p.adjust | qvalues | |
|---|---|---|---|---|---|
| umls:C0008074 | umls:C0008074 | Child Development Disorders, Pervasive | 2.208991 | 0.0360512 | 0.0050244 |
| umls:C0036857 | umls:C0036857 | Severe mental retardation (I.Q. 20-34) | 2.172130 | 0.0361686 | 0.0050244 |
| umls:C0035372 | umls:C0035372 | Rett Syndrome | 2.113475 | 0.0326233 | 0.0050244 |
| umls:C0018817 | umls:C0018817 | Atrial Septal Defects | 1.988084 | 0.0306196 | 0.0050244 |
| umls:C0014544 | umls:C0014544 | Epilepsy | 1.881564 | 0.0267545 | 0.0050244 |
| ID | Description | NES | p.adjust | qvalues | |
|---|---|---|---|---|---|
| hsa04921 | hsa04921 | Oxytocin signaling pathway | 2.187167 | 0.0043745 | 0.0008781 |
| hsa04713 | hsa04713 | Circadian entrainment | 2.114976 | 0.0092065 | 0.0008781 |
| hsa04970 | hsa04970 | Salivary secretion | 2.031143 | 0.0334118 | 0.0016375 |
| hsa04261 | hsa04261 | Adrenergic signaling in cardiomyocytes | 2.015750 | 0.0177863 | 0.0010729 |
| hsa04728 | hsa04728 | Dopaminergic synapse | 1.973352 | 0.0177863 | 0.0010729 |
| ID | Description | NES | p.adjust | qvalues | |
|---|---|---|---|---|---|
| R-HSA-3214842 | R-HSA-3214842 | HDMs demethylate histones | 2.297315 | 0.0164876 | 0.0012422 |
| R-HSA-5578775 | R-HSA-5578775 | Ion homeostasis | 2.268130 | 0.0159153 | 0.0012422 |
| R-HSA-445095 | R-HSA-445095 | Interaction between L1 and Ankyrins | 2.229128 | 0.0167107 | 0.0012422 |
| R-HSA-73728 | R-HSA-73728 | RNA Polymerase I Promoter Opening | 2.223382 | 0.0166736 | 0.0012422 |
| R-HSA-2299718 | R-HSA-2299718 | Condensation of Prophase Chromosomes | 2.191772 | 0.0490282 | 0.0014944 |
Plots of the results when there are more than 2 terms in common between methods:
plot_shared_genes_GSEA(selected_modules[3], 5)
print_ORA_top_results(selected_modules[3], 5)
GO:
No enriched terms found
DO:
No enriched terms found
DGN:
No enriched terms found
KEGG:
No enriched terms found
Reactome:
No enriched terms found
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.6 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] xtable_1.8-4 kableExtra_1.1.0 knitr_1.32
## [4] doParallel_1.0.15 iterators_1.0.13 foreach_1.5.1
## [7] org.Hs.eg.db_3.8.2 AnnotationDbi_1.46.1 IRanges_2.18.3
## [10] S4Vectors_0.22.1 Biobase_2.44.0 BiocGenerics_0.30.0
## [13] DOSE_3.10.2 ReactomePA_1.28.0 clusterProfiler_3.12.0
## [16] biomaRt_2.40.5 polycor_0.7-10 expss_0.10.7
## [19] WGCNA_1.69 fastcluster_1.2.3 dynamicTreeCut_1.63-1
## [22] ggExtra_0.9 ggpubr_0.2.5 magrittr_2.0.1
## [25] GGally_1.5.0 corrplot_0.90 colorspace_2.0-2
## [28] gridExtra_2.3 viridis_0.6.1 viridisLite_0.4.0
## [31] RColorBrewer_1.1-2 dendextend_1.15.1 plotly_4.9.2
## [34] glue_1.4.2 reshape2_1.4.4 forcats_0.5.0
## [37] stringr_1.4.0 dplyr_1.0.1 purrr_0.3.4
## [40] readr_1.3.1 tidyr_1.1.0 tibble_3.1.2
## [43] ggplot2_3.3.5 tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.2 tidyselect_1.1.1
## [3] RSQLite_2.2.0 htmlwidgets_1.5.3
## [5] grid_3.6.3 BiocParallel_1.18.1
## [7] munsell_0.5.0 codetools_0.2-16
## [9] preprocessCore_1.46.0 miniUI_0.1.1.1
## [11] withr_2.4.2 GOSemSim_2.10.0
## [13] highr_0.9 rstudioapi_0.13
## [15] ggsignif_0.6.2 labeling_0.4.2
## [17] urltools_1.7.3 GenomeInfoDbData_1.2.1
## [19] polyclip_1.10-0 bit64_4.0.5
## [21] farver_2.1.0 vctrs_0.3.8
## [23] generics_0.1.0 xfun_0.25
## [25] GenomeInfoDb_1.20.0 R6_2.5.1
## [27] graphlayouts_0.7.0 locfit_1.5-9.4
## [29] DelayedArray_0.10.0 bitops_1.0-7
## [31] cachem_1.0.6 reshape_0.8.8
## [33] fgsea_1.10.1 gridGraphics_0.5-1
## [35] assertthat_0.2.1 promises_1.2.0.1
## [37] scales_1.1.1 ggraph_2.0.3
## [39] nnet_7.3-14 enrichplot_1.4.0
## [41] gtable_0.3.0 tidygraph_1.2.0
## [43] rlang_0.4.11 genefilter_1.66.0
## [45] splines_3.6.3 lazyeval_0.2.2
## [47] acepack_1.4.1 impute_1.58.0
## [49] broom_0.7.0 europepmc_0.4
## [51] checkmate_2.0.0 BiocManager_1.30.16
## [53] yaml_2.2.1 modelr_0.1.6
## [55] crosstalk_1.1.1 backports_1.2.1
## [57] httpuv_1.6.1 qvalue_2.16.0
## [59] Hmisc_4.4-0 tools_3.6.3
## [61] ggplotify_0.1.0 ellipsis_0.3.2
## [63] jquerylib_0.1.4 ggridges_0.5.3
## [65] Rcpp_1.0.7 plyr_1.8.6
## [67] zlibbioc_1.30.0 base64enc_0.1-3
## [69] progress_1.2.2 RCurl_1.98-1.4
## [71] prettyunits_1.1.1 rpart_4.1-15
## [73] cowplot_1.1.1 SummarizedExperiment_1.14.1
## [75] haven_2.2.0 ggrepel_0.9.1
## [77] cluster_2.1.0 fs_1.5.0
## [79] data.table_1.14.0 DO.db_2.9
## [81] reactome.db_1.68.0 triebeard_0.3.0
## [83] reprex_0.3.0 matrixStats_0.60.1
## [85] hms_1.1.0 mime_0.11
## [87] evaluate_0.14 XML_3.99-0.3
## [89] jpeg_0.1-9 readxl_1.3.1
## [91] compiler_3.6.3 crayon_1.4.1
## [93] htmltools_0.5.2 later_1.3.0
## [95] Formula_1.2-4 geneplotter_1.62.0
## [97] lubridate_1.7.10 DBI_1.1.1
## [99] tweenr_1.0.2 dbplyr_1.4.2
## [101] rappdirs_0.3.3 MASS_7.3-53
## [103] Matrix_1.2-18 cli_3.0.1
## [105] igraph_1.2.6 GenomicRanges_1.36.1
## [107] pkgconfig_2.0.3 rvcheck_0.1.8
## [109] foreign_0.8-76 xml2_1.3.2
## [111] annotate_1.62.0 bslib_0.3.0
## [113] XVector_0.24.0 webshot_0.5.2
## [115] rvest_0.3.5 yulab.utils_0.0.2
## [117] digest_0.6.27 graph_1.62.0
## [119] rmarkdown_2.7 cellranger_1.1.0
## [121] fastmatch_1.1-3 htmlTable_1.13.3
## [123] curl_4.3.2 shiny_1.6.0
## [125] graphite_1.30.0 lifecycle_1.0.0
## [127] jsonlite_1.7.2 fansi_0.5.0
## [129] pillar_1.6.2 lattice_0.20-41
## [131] fastmap_1.1.0 httr_1.4.2
## [133] survival_3.2-7 GO.db_3.8.2
## [135] UpSetR_1.4.0 png_0.1-7
## [137] bit_4.0.4 ggforce_0.3.1
## [139] stringi_1.7.4 sass_0.4.0
## [141] blob_1.2.2 DESeq2_1.24.0
## [143] latticeExtra_0.6-29 memoise_2.0.0